This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
Scrape tweets for the hashtag #datascience and do the following:
- Hour-wise or minute-wise (depending upon the scraped tweets) number of tweets - draw a line chart
- Top ten users by number of tweets
- Create a new column in the data itself, to identify the total number of hashtags in each tweet
- Identify those users who have used #datascience as well as #machinelearning. Plot a bar chart of the top 10 users based on their count
rm(list=ls())
library(tm)
library(dplyr)
library(stringi)
library(stringr)
library(wordcloud)
library(ggplot2)
library(plotly)
library(twitteR)
library(ROAuth)
library(httr)
library(base64enc)
library(httk)
library(RCurl)
library(zoo)
#' Search Twitter for a hashtag and return the matching tweets as a data frame.
#'
#' Authenticates with `setup_twitter_oauth()`, runs `searchTwitter()` for
#' `hashtag`, and converts the result with `twListToDF()`.
#'
#' @param hashtag Search string handed to `searchTwitter()`, e.g. "#datascience".
#' @param n Maximum number of tweets to request. Defaults to 5000, the value
#'   that used to be hard-coded.
#' @return The data frame produced by `twListToDF()`.
serchHashtagTwitter <- function(hashtag, n = 5000) {
  # Read the credentials from the environment when they are set, so real keys
  # never need to be written into this file. The original placeholders remain
  # the fallback values.
  api_key <- Sys.getenv("TWITTER_API_KEY", unset = "XXXX")
  api_secret <- Sys.getenv("TWITTER_API_SECRET", unset = "XXXXX")
  token <- Sys.getenv("TWITTER_ACCESS_TOKEN", unset = "XXXXX")
  token_secret <- Sys.getenv("TWITTER_ACCESS_SECRET", unset = "XXXXXX")

  # NOTE(review): this disables TLS peer verification through httr's global
  # configuration. It is kept to preserve existing behaviour, but it weakens
  # transport security and should be removed once certificates validate.
  # `httr::config` is spelled out so another attached package exporting a
  # `config` function cannot be picked up by accident.
  httr::set_config(httr::config(ssl_verifypeer = 0L))

  setup_twitter_oauth(
    consumer_key = api_key,
    consumer_secret = api_secret,
    access_token = token,
    access_secret = token_secret)

  # data.science.tweet <- searchTwitter("#datascience", 5000)
  data.science.tweet <- searchTwitter(hashtag, n)

  ## Convert the tweets to a data frame
  twListToDF(data.science.tweet)
}
#' Read a tweet CSV export and parse its `created` column as date-times.
#'
#' @param tweetfile.Path Path to a CSV file containing a `created` column
#'   formatted as "%Y-%m-%d %H:%M:%S".
#' @return A data frame with strings kept as character and `created`
#'   converted to POSIXct, interpreted in the GMT time zone.
readCSVFile <- function(tweetfile.Path) {
  # FALSE is spelled out because `F` is an ordinary variable that can be
  # reassigned.
  tweets <- read.csv(tweetfile.Path, stringsAsFactors = FALSE)

  # Fail with a clear message instead of an obscure replacement-length error.
  if (!"created" %in% names(tweets)) {
    stop("CSV file has no 'created' column: ", tweetfile.Path, call. = FALSE)
  }

  # strptime() returns POSIXlt, which is awkward inside a data frame, so the
  # parsed value is converted to POSIXct straight away.
  tweets$created <- as.POSIXct(
    strptime(tweets$created, format = "%Y-%m-%d %H:%M:%S", tz = "GMT"))
  tweets
}
Using ‘searchTwitter’ to get the tweets for #datascience
## Option 1 (disabled): search Twitter live for tweets with the hashtag '#datascience'
# df.data.science.tweet <- serchHashtagTwitter("#datascience")
## Option 2: read previously saved tweets from a CSV file.
## NOTE(review): the path is absolute and machine-specific; adjust it before running elsewhere.
df.data.science.tweet <- readCSVFile("/Users/ritesh/pad-datascience/R/unstructureData/data/datascience_tweet.csv")
#head(df.data.science.tweet)
#View(df.data.science.tweet)
## Rows and columns of the loaded data
dim(df.data.science.tweet)
## [1] 5000 17
## Disabled: save the tweets so that later runs can load them from the CSV file
# write.csv(df.data.science.tweet, file = "/Users/ritesh/pad-datascience/R/unstructureData/data/datascience_tweet.csv")
Hour-wise or minute-wise (depending upon the scraped tweets) number of tweets - draw a line chart
## Convert the text column to character
df.data.science.tweet$text <- as.character(df.data.science.tweet$text)
## Replace special characters with a space; letters, digits, '/', apostrophes,
## '#', '_' and spaces are kept
df.data.science.tweet$text_transformed <-
  gsub("[^A-Za-z0-9///'/#/_ ]", " ", df.data.science.tweet$text)
## Lower-case the transformed text. tolower() is vectorised, so calling it
## directly keeps the column a plain character vector (wrapping it in lapply()
## turned the column into a list)
df.data.science.tweet$text_transformed <-
  tolower(df.data.science.tweet$text_transformed)
## Inspect the columns and the type of the date field 'created'
names(df.data.science.tweet)
## [1] "X" "text" "favorited"
## [4] "favoriteCount" "replyToSN" "created"
## [7] "truncated" "replyToSID" "id"
## [10] "replyToUID" "statusSource" "screenName"
## [13] "retweetCount" "isRetweet" "retweeted"
## [16] "longitude" "latitude" "text_transformed"
class(df.data.science.tweet$created)
## [1] "POSIXct" "POSIXt"
## Create a minute-resolution key (date, hour and minute) for the minute-wise
## tweet count. The year-month-day layout makes alphabetical order equal to
## chronological order, so the grouped summary, head() and the plot axis all
## run forward in time; a day-first layout does not sort that way.
df.data.science.tweet$cnvdate <-
  format(df.data.science.tweet$created, "%Y-%m-%d %H:%M")
class(df.data.science.tweet$cnvdate)
## [1] "character"
# View(df.data.science.tweet)
## Calculate the minute-wise tweet count
tweets_minutes <-
  df.data.science.tweet %>%
  group_by(cnvdate) %>%
  summarise(count = n())
## Plot only the first 40 minutes so the axis labels stay readable
tweets_minutes_subset <- head(tweets_minutes, 40)
## group = 1 joins all points into a single line although the x axis is discrete
minutesPlot <-
  ggplot(data = tweets_minutes_subset, aes(x = cnvdate, y = count, group = 1)) +
  geom_line() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ggtitle("Tweet Count Per Minute", subtitle = "Minute Wise Tweet Count") +
  xlab("\nMinute\n") +
  ylab("\nTweet Count")
# ## Dynamic line chart using 'plotly'
# ggplotly(minutesPlot)
## Static line chart
plot(minutesPlot)
Top ten users by number of tweets
## Number of tweets versus number of distinct authors
length(df.data.science.tweet$screenName)
## [1] 5000
length(unique(df.data.science.tweet$screenName))
## [1] 1932
## Count the tweets of every user and list the most active users first
tweets_user <-
  df.data.science.tweet %>%
  group_by(screenName) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
class(tweets_user)
## [1] "tbl_df" "tbl" "data.frame"
#View(tweets_user)
## The ten users with the most tweets
top_10 <- tweets_user %>% head(10)
top_10
## # A tibble: 10 × 2
## screenName count
## <chr> <int>
## 1 alevergara78 369
## 2 chidambara09 126
## 3 fernandocuenca 99
## 4 ArkangelScrap 85
## 5 gp_pulipaka 73
## 6 MeeTMercurio 55
## 7 StartUpRealTime 54
## 8 IoTimelab 49
## 9 iamOlAOYE 42
## 10 DaveRubal 39
Create a new column in the data itself, to identify the total number of hashtags in each tweet
#' Count, for each string, how many of its pieces match a pattern.
#'
#' Every element of `x` is split on `split` with `strsplit()`, and the pieces
#' that `grep()` matches against `pattern` are counted.
#'
#' @param x Character vector of strings to inspect.
#' @param pattern Regular expression handed to `grep()`.
#' @param split Separator handed to `strsplit()`.
#' @return An integer vector with one count per element of `x`
#'   (`integer(0)` when `x` is empty).
strcount <- function(x, pattern, split) {
  pieces <- strsplit(x, split)
  # vapply() pins the result to integer even for empty input, where
  # unlist(lapply(...)) would give NULL. The former na.omit() wrapper is gone
  # because length() never produces NA.
  vapply(pieces, function(z) length(grep(pattern, z)), integer(1))
}
## Create new column 'hashTagCount' containing the number of hashtags in each
## tweet. vapply() yields a plain integer column; lapply() stored a list
## column, which cannot be summed, sorted or plotted directly.
df.data.science.tweet$hashTagCount <-
  vapply(
    df.data.science.tweet$text_transformed,
    FUN = function(x2) strcount(x2, "#\\S+", " "),
    FUN.VALUE = integer(1),
    USE.NAMES = FALSE)
#unique(df.data.science.tweet$hashTagCount)
Identify those users who have used #datascience as well as #machinelearning. Plot a bar chart of the top 10 users based on their count
## Count '#machinelearning' and '#datascience' in each tweet.
## The patterns are anchored: the word must start with the hashtag and be
## followed by the end of the word or by a character that cannot be part of a
## hashtag. An unanchored pattern would also count longer hashtags that merely
## begin with the same letters (e.g. '#datasciencejobs').
## FUN.VALUE = numeric(1) makes both columns plain numeric vectors.
df.data.science.tweet$mlHashTagCount <-
  vapply(
    df.data.science.tweet$text_transformed,
    FUN = function(x2) strcount(x2, "^#machinelearning([^A-Za-z0-9_]|$)", " "),
    FUN.VALUE = numeric(1),
    USE.NAMES = FALSE)
df.data.science.tweet$dsHashTagCount <-
  vapply(
    df.data.science.tweet$text_transformed,
    FUN = function(x2) strcount(x2, "^#datascience([^A-Za-z0-9_]|$)", " "),
    FUN.VALUE = numeric(1),
    USE.NAMES = FALSE)
## Add a column that is 1 when a tweet has both '#datascience' and
## '#machinelearning', and 0 otherwise
df.data.science.tweet$DsMlHashTagCount <-
  as.numeric(
    df.data.science.tweet$dsHashTagCount > 0 &
      df.data.science.tweet$mlHashTagCount > 0)
#unique(df.data.science.tweet$DsMlHashTagCount)
## Number of tweets per user that carry both hashtags, largest first
tweets_user_ds_ml <-
  df.data.science.tweet %>%
  group_by(screenName) %>%
  summarise(totalCount = sum(DsMlHashTagCount, na.rm = TRUE)) %>%
  arrange(desc(totalCount))
top_10_users <- head(tweets_user_ds_ml, 10)
## Bar plot of the top 10 users who have used '#datascience' and '#machinelearning'
bar_chart_tweet_count <-
  ggplot(top_10_users, aes(x = reorder(screenName, -totalCount), y = totalCount)) +
  geom_bar(stat = "identity") +
  theme_light() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ggtitle(
    "Users Vs Tweet Count",
    subtitle = "Top 10 users tweeting both #datascience and #machinelearning") +
  xlab("\nUser") +
  ylab("\nTweet Count")
## Interactive version of the chart
ggplotly(bar_chart_tweet_count)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
## Static version of the chart
plot(bar_chart_tweet_count)
Scrape tweets from @narendramodi and do the following
- Create a word cloud using the hashtags used in his tweets. (If you are not able to scrape more tweets, please use the data set which was shared already)
- For each quarter identify top 5 hashtags based on frequency. Represent them using bar charts (Tip: use facets)
## Load the saved @narendramodi tweets.
## NOTE(review): the path is absolute and machine-specific; adjust it before running elsewhere.
## FALSE is spelled out because `F` is an ordinary variable that can be reassigned.
modi.tweets <-
  read.csv(
    "/Users/ritesh/pad-datascience/R/unstructureData/data/narendramodi_tweets.csv",
    stringsAsFactors = FALSE)
## Convert the text column to character
modi.tweets$text <- as.character(modi.tweets$text)
## Replace special characters with a space; letters, digits, '/', apostrophes,
## '#', '_' and spaces are kept
modi.tweets$text_transformed <-
  gsub("[^A-Za-z0-9///'/#/_ ]", " ", modi.tweets$text)
# View(modi.tweets)
Create a word cloud using the hashtags used in his tweets. (If you are not able to scrape more tweets, please use the data set which was shared already)
## Split every tweet into words and tabulate how often each word occurs
words_list <- str_split(modi.tweets$text_transformed, " ")
words <- unlist(words_list)
freq_words <- data.frame(table(words))
## Keep only the hashtags, most frequent first
df.hashtag_words <-
  freq_words[grepl('#\\S+', as.character(freq_words$words)), ]
df.hashtag_words <- df.hashtag_words %>% arrange(desc(Freq))
## NOTE: despite its name, top_20 holds every hashtag, not just the first 20;
## the word cloud is drawn from all of them
top_20 <- df.hashtag_words
## Word cloud (TRUE is spelled out because `T` can be reassigned)
wordcloud(
  top_20$words,
  top_20$Freq,
  scale = c(4, .5),
  random.order = TRUE,
  random.color = TRUE,
  rot.per = .2,
  vfont = c("sans serif", "plain"),
  colors = palette())
## Warning in wordcloud(top_20$words, top_20$Freq, scale = c(4, 0.5),
## random.order = T, : #Sandesh2Soldiers could not be fit on page. It will not
## be plotted.
For each quarter identify top 5 hashtags based on frequency. Represent them using bar charts (Tip: use facets)
# View(modi.tweets)
## Add the quarter of each tweet as a separate column
modi.tweets$qtr <- as.yearqtr(modi.tweets$created_at, format = "%Y-%m-%d")
#unique(modi.tweets$qtr)
## Split each tweet into its words
modi.tweets$words_list <- str_split(modi.tweets$text_transformed, " ")
# View(modi.tweets)
## Group the data frame by quarter and collect each quarter's words in a list
quarterly.GrpWord.List <- modi.tweets %>%
  group_by(qtr) %>%
  summarise(words = list(words_list))
# View(quarterly.GrpWord.List)
## Flatten each quarter's words and keep only the hashtags
quarterly.GrpWord.List$words <-
  lapply(
    quarterly.GrpWord.List$words,
    FUN = function(quarter_words) {
      flat_words <- unlist(quarter_words)
      flat_words[grepl('#\\S+', as.character(flat_words))]
    })
## Convert the quarter column type from 'yearqtr' to 'character'
class(quarterly.GrpWord.List$qtr)
## [1] "yearqtr"
quarterly.GrpWord.List$qtr <- as.character(quarterly.GrpWord.List$qtr)
## Build one data frame per quarter holding its five most frequent hashtags.
## Map() always returns a list, so there is no dependence on mapply() happening
## to simplify its result into a matrix. Quarters without any hashtag return
## NULL and are skipped instead of raising an error.
grp.df.list <- Map(
  function(tags, quarter) {
    if (length(tags) == 0) {
      return(NULL)
    }
    # Naming the table dimension `x` gives the columns `x` and `Freq`, which
    # the plot below refers to.
    counts <- as.data.frame(table(x = tags), stringsAsFactors = FALSE)
    counts$qtr <- quarter
    head(counts %>% arrange(desc(Freq)), 5)
  },
  quarterly.GrpWord.List$words,
  quarterly.GrpWord.List$qtr)
## Combine all per-quarter data frames; bind_rows() ignores the NULL entries
top5.grp.df <- bind_rows(grp.df.list)
# View(top5.grp.df)
## One bar-chart facet per quarter
bar_chart_qtr_tweet_count <-
  ggplot(top5.grp.df, aes(x = reorder(x, -Freq), y = Freq)) +
  geom_bar(stat = "identity") +
  theme_light() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ggtitle("Quarterly Top 5 Hashtag Count", subtitle = "Top 5 hashtags per quarter") +
  xlab("\n Top-5 - Hashtags") +
  ylab("\n Hashtag Count") +
  facet_wrap(~qtr, ncol = 5, scales = "free")
# ggplotly(bar_chart_qtr_tweet_count)
plot(bar_chart_qtr_tweet_count)
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.